Food Safety Classification Model¶

Here is a project to promote the food safety in Chicago. My goal is to create classification model, predicting the outcome of food safety inspection based on the inspectors’ comments.

In [2]:
import pandas as pd
import requests
import re
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
In [3]:
%%time

# Define the API endpoint and parameters
url = "https://data.cityofchicago.org/resource/cwig-ma7x.json"

# Fetch the total count of records
def get_total_count(url):
    params = {
        "$select": "count(*)"
    }
    response = requests.get(url, params=params)
    data = response.json()
    return int(data[0]['count'])

total_count = get_total_count(url)
print(f"Total number of records available: {total_count:,.0f}")
Total number of records available: 267,124
CPU times: total: 46.9 ms
Wall time: 584 ms
In [4]:
%%time

# Fetch data and load it into a pandas DataFrame
def fetch_data(url, params):
    response = requests.get(url, params=params)
    data = response.json()
    return pd.DataFrame(data)

# Set the limit parameter equal to the number of available records
params = {
    "$limit": total_count
}

# Fetch the data and load it into a DataFrame
df = fetch_data(url, params)

print(f'Number of records retrieved: {df.shape[0]:,.0f}')
Number of records retrieved: 267,124
CPU times: total: 4.75 s
Wall time: 27.4 s
In [5]:
df.head(5)
Out[5]:
inspection_id dba_name aka_name license_ facility_type risk address city state zip ... results violations latitude longitude location :@computed_region_awaf_s7ux :@computed_region_6mkv_f3dw :@computed_region_vrxf_vc4k :@computed_region_bdys_3d7i :@computed_region_43wa_7qmu
0 2588891 THE LANZAGA THE LANZAGA 2951700 Restaurant Risk 1 (High) 3734-3736 N BROADWAY CHICAGO IL 60613 ... Fail 5. PROCEDURES FOR RESPONDING TO VOMITING AND D... 41.950232386786 -87.64918094440476 {'type': 'Point', 'coordinates': [-87.64918094... 37 21186 57 726 39
1 2588914 SUBWAY SUBWAY 2262950 Restaurant Risk 1 (High) 2008 N HALSTED ST CHICAGO IL 60614 ... Pass 51. PLUMBING INSTALLED; PROPER BACKFLOW DEVICE... 41.91849213778706 -87.64866909416301 {'type': 'Point', 'coordinates': [-87.64866909... 51 21190 68 744 34
2 2588885 CARNICERIA Y FRUTERIA EL MILAGRITO CARNICERIA Y FRUTERIA EL MILAGRITO 2930227 Grocery Store Risk 2 (Medium) 5960 W LAWRENCE AVE CHICAGO IL 60630 ... Pass 54. GARBAGE & REFUSE PROPERLY DISPOSED; FACILI... 41.96776480013477 -87.777020204254 {'type': 'Point', 'coordinates': [-87.77702020... 20 21869 15 94 50
3 2588881 JET'S PIZZA JET'S PIZZA 2522268 Restaurant Risk 1 (High) 1025 W MADISON ST CHICAGO IL 60607 ... Pass NaN 41.88157249576794 -87.65305233593274 {'type': 'Point', 'coordinates': [-87.65305233... 48 14917 29 91 26
4 2588848 HO FAT LEE CHINESE KITCHEN, I NC. HO FAT LEE CHINESE KITCHEN 1543266 Restaurant Risk 1 (High) 1114 S KEDZIE AVE CHICAGO IL 60612 ... Pass NaN 41.86778016933903 -87.70585900858474 {'type': 'Point', 'coordinates': [-87.70585900... 36 21184 30 98 14

5 rows × 22 columns

Preprocessing¶

In [6]:
clean_data = df.copy()
clean_data.dropna(subset = ['violations'], inplace=True)
def desc(x):
    pattern = r"\d+\. (.*?)(?: - Comments:|$)"
    parts = re.split(r'\|', x)
    res = []
    for part in parts:
        res = res + re.findall(pattern,part)
    return res

def comments(x):
    parts = re.split(r'\|',x)
    pattern = r'- Comments: (.*)'
    res = []
    for part in parts:
        res = res + re.findall(pattern,part)
    return res

clean_data["comments"] = clean_data.loc[:,"violations"].map(comments)
In [7]:
import nltk
import nltk.corpus  
from nltk.text import Text

# Ensure NLTK resources are downloaded (e.g., tokenizers, corpora)
nltk.download('punkt')

clean_data["comments_tokenized"] = clean_data["comments"].map(lambda x:[nltk.tokenize.word_tokenize(i) for i in x])
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\10124\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
In [8]:
nltk.download('stopwords')
from nltk.corpus import stopwords

def clean_tokens(tokens_list):
    cleaned_tokens_list = []
    # Assuming tokens_list is a list of lists
    for tokens in tokens_list:
        # Convert to lower case
        tokens = [token.lower() for token in tokens]
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]
        # Remove punctuation & numbers (keep only alphabetic tokens)
        tokens = [token for token in tokens if token.isalpha()]
        cleaned_tokens_list.append(tokens)
    return cleaned_tokens_list

# Assuming clean_data['comments_tokenized'] is a list of lists
clean_data['comments_tokenized_clean'] = clean_data['comments_tokenized'].apply(lambda x: clean_tokens(x) if isinstance(x, list) and all(isinstance(lst, list) for lst in x) else [])
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\10124\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [ ]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

wnl = WordNetLemmatizer()

def flatten_and_lemmatize(tokens_list_of_lists):
    # Flatten the list of lists into a single list
    flattened_list = [token for sublist in tokens_list_of_lists for token in sublist]
    # Lemmatize each token in the flattened list
    lemmatized_tokens = [wnl.lemmatize(token) for token in flattened_list]
    return lemmatized_tokens

# Apply the function to the column that contains a list of lists
clean_data['comments_tokenized_flatten_lemma'] = clean_data['comments_tokenized_clean'].apply(flatten_and_lemmatize)
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\10124\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
In [11]:
clean_data['comments_final'] = clean_data['comments_tokenized_flatten_lemma'].apply(lambda x: ' '.join(map(str, x)))
In [11]:
clean_data.head()
Out[11]:
inspection_id dba_name aka_name license_ facility_type risk address city state zip ... :@computed_region_awaf_s7ux :@computed_region_6mkv_f3dw :@computed_region_vrxf_vc4k :@computed_region_bdys_3d7i :@computed_region_43wa_7qmu comments comments_tokenized comments_tokenized_clean comments_tokenized_flatten_lemma comments_final
0 2588891 THE LANZAGA THE LANZAGA 2951700 Restaurant Risk 1 (High) 3734-3736 N BROADWAY CHICAGO IL 60613 ... 37 21186 57 726 39 [NO WRITTEN CLEANING PROCEDURE OR REQUIRED EQU... [[NO, WRITTEN, CLEANING, PROCEDURE, OR, REQUIR... [[written, cleaning, procedure, required, equi... [written, cleaning, procedure, required, equip... written cleaning procedure required equipment ...
1 2588914 SUBWAY SUBWAY 2262950 Restaurant Risk 1 (High) 2008 N HALSTED ST CHICAGO IL 60614 ... 51 21190 68 744 34 [ 5-205.15--- OBSERVED LEAKING AT THE FAUCET B... [[5-205.15, --, -, OBSERVED, LEAKING, AT, THE,... [[observed, leaking, faucet, base, rear, sink,... [observed, leaking, faucet, base, rear, sink, ... observed leaking faucet base rear sink must re...
2 2588885 CARNICERIA Y FRUTERIA EL MILAGRITO CARNICERIA Y FRUTERIA EL MILAGRITO 2930227 Grocery Store Risk 2 (Medium) 5960 W LAWRENCE AVE CHICAGO IL 60630 ... 20 21869 15 94 50 [OBSERVED BOTH EXPOSED HANDSINKS IN BUTCHER PR... [[OBSERVED, BOTH, EXPOSED, HANDSINKS, IN, BUTC... [[observed, exposed, handsinks, butcher, prep,... [observed, exposed, handsinks, butcher, prep, ... observed exposed handsinks butcher prep area w...
5 2588819 COMMON DECENCY FEVER DREAM COMMON DECENY FEVER DREAM 2944240 Restaurant Risk 1 (High) 3152-3154 W DIVERSEY AVE CHICAGO IL 60647 ... 15 22535 22 465 20 [FOUND NO EMPLOYEE HEALTH POLICY AT PREMISES.P... [[FOUND, NO, EMPLOYEE, HEALTH, POLICY, AT, PRE... [[found, employee, health, policy, citation, p... [found, employee, health, policy, citation, pr... found employee health policy citation provide ...
6 2588828 TIKAL SABOR CHAPIN TIKAL SABOR CHAPIN 2951612 Restaurant Risk 1 (High) 3216 W LAWRENCE AVE CHICAGO IL 60625 ... 28 21849 14 750 20 [OBSERVED NO WRITTEN EMPLOYEE HEALTH POLICY ON... [[OBSERVED, NO, WRITTEN, EMPLOYEE, HEALTH, POL... [[observed, written, employee, health, policy,... [observed, written, employee, health, policy, ... observed written employee health policy premis...

5 rows × 27 columns

In [12]:
clean_data["results"].unique()
Out[12]:
array(['Fail', 'Pass', 'Pass w/ Conditions', 'No Entry', 'Not Ready',
       'Out of Business'], dtype=object)

Build a classification model¶

predicting the outcome of inspection – comments are predictors, target variable is “Results” column

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics
In [14]:
violation = clean_data[["comments_final","results"]]
In [15]:
violation = violation[(violation["results"] == "Pass") | (violation["results"] == "Fail")]
violation.groupby("results").count()
Out[15]:
comments_final
results
Fail 48730
Pass 104467
In [16]:
violation = violation.groupby("results").sample(n=10000, random_state=1)
violation.reset_index(inplace=True, drop=True)
violation.groupby("results").count()
Out[16]:
comments_final
results
Fail 10000
Pass 10000
In [17]:
violation['result_flag'] = violation.results.map({'Fail':0, 'Pass':1})
In [18]:
X = violation.comments_final
y = violation.result_flag
In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
In [20]:
# Initialize CountVectorizer with the desired ngram range
vect = CountVectorizer(ngram_range=(1, 2))

X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
In [21]:
print(X_train)
18960    metal walk cooler shelf shelving rice cooker d...
11353    food contact surface equipment food storage ut...
7449     observed singed employee health policy site fo...
14308    broken exposed insulation liner inside white c...
11889    exterior grease trap dishroom showing sign det...
                               ...                        
10955    vent throughout clean instructed clean must cl...
17289    observed rubber gasket door door cooler prep a...
5192     found front exit door inch gap glass door fron...
12172    must defrost deep freezer basement instructed ...
235      noted broken knob exposed hand wash sink cold ...
Name: comments_final, Length: 15000, dtype: object

Logistic Regression¶

In [22]:
logreg = LogisticRegression(max_iter=10000)
In [23]:
%time logreg.fit(X_train_dtm, y_train)
CPU times: total: 15.7 s
Wall time: 18.7 s
Out[23]:
LogisticRegression(max_iter=10000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(max_iter=10000)
In [24]:
y_pred_class = logreg.predict(X_test_dtm)
In [25]:
logic_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(f"Test Accuracy: {logic_accuracy * 100:.1f}%")
Test Accuracy: 96.5%
In [26]:
print(classification_report(y_test, y_pred_class))
              precision    recall  f1-score   support

           0       0.98      0.95      0.97      2569
           1       0.95      0.98      0.96      2431

    accuracy                           0.96      5000
   macro avg       0.96      0.97      0.96      5000
weighted avg       0.97      0.96      0.96      5000

In [27]:
clf = logreg


feature_names = vect.get_feature_names_out()
coefs_with_fns = zip(feature_names, clf.coef_[0])

coefs_with_fns_df = pd.DataFrame(coefs_with_fns,
                    columns=['feature', 'coefficient'])
In [28]:
coefs_with_fns_df.sort_values(by='coefficient', ascending=True, inplace=True)
coefs_with_fns_df.head(10)
Out[28]:
feature coefficient
130675 serious -3.536678
35563 critical -2.011492
111673 priority -2.010268
22637 citation -1.933869
160045 violation -1.763725
130853 serious violation -1.504526
76401 issued -1.396010
35666 critical violation -1.261778
160309 violation observed -1.183781
9218 barrier -1.140754

Naive Bayes Model¶

In [29]:
# instantiate a Multinomial Naive Bayes model
nb = MultinomialNB()
In [30]:
# train and time the model using X_train_dtm
%time nb.fit(X_train_dtm, y_train)
CPU times: total: 31.2 ms
Wall time: 32 ms
Out[30]:
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MultinomialNB()
In [31]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)
In [32]:
# calculate accuracy of class predictions
bayes_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(f"Test Accuracy: {bayes_accuracy * 100:.1f}%")
Test Accuracy: 90.0%
In [33]:
# calculate precision and recall
print(classification_report(y_test, y_pred_class))
              precision    recall  f1-score   support

           0       0.87      0.94      0.91      2569
           1       0.94      0.85      0.89      2431

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000

In [34]:
# calculate the confusion matrix
print(metrics.confusion_matrix(y_test, y_pred_class))
[[2426  143]
 [ 356 2075]]
In [35]:
clf = nb  # Assuming 'nb' is an instance of MultinomialNB that's already been fitted

feature_names = vect.get_feature_names_out()
log_prob = clf.feature_log_prob_

# Since we're dealing with log probabilities, you can directly use 'log_prob'
# Assuming interest in the first class for demonstration
coefs_with_fns = zip(feature_names, log_prob[0])

coefs_with_fns_df = pd.DataFrame(coefs_with_fns, columns=['feature', 'log_probability'])
In [36]:
coefs_with_fns_df.sort_values(by='log_probability', ascending=False, inplace=True)
coefs_with_fns_df.head(10)
Out[36]:
feature log_probability
57449 food -4.472817
93479 must -4.477255
4647 area -4.613348
74781 instructed -4.642012
97701 observed -4.706363
23172 clean -4.833233
135100 sink -4.952810
55384 floor -5.028783
86684 maintain -5.048695
110000 prep -5.139704

SVM¶

In [37]:
# instantiate a SVM model
svm = SGDClassifier(max_iter=100, tol=None)
# train the model using X_train_dtm
%time svm.fit(X_train_dtm, y_train)
# make class predictions for X_test_dtm
y_pred_class = svm.predict(X_test_dtm)
CPU times: total: 562 ms
Wall time: 563 ms
In [38]:
# calculate accuracy of class predictions
svm_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(svm_accuracy)
0.9462
In [39]:
# calculate precision and recall
print(classification_report(y_test, y_pred_class))
# calculate the confusion matrix
print(metrics.confusion_matrix(y_test, y_pred_class))
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      2569
           1       0.94      0.95      0.95      2431

    accuracy                           0.95      5000
   macro avg       0.95      0.95      0.95      5000
weighted avg       0.95      0.95      0.95      5000

[[2418  151]
 [ 118 2313]]
In [40]:
svm
Out[40]:
SGDClassifier(max_iter=100, tol=None)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SGDClassifier(max_iter=100, tol=None)
In [41]:
clf = svm


feature_names = vect.get_feature_names_out()
coefs_with_fns = zip(feature_names, clf.coef_[0])

coefs_with_fns_df = pd.DataFrame(coefs_with_fns,
                    columns=['feature', 'coefficient'])
In [42]:
coefs_with_fns_df.sort_values(by='coefficient', ascending=True, inplace=True)
coefs_with_fns_df.head(10)
Out[42]:
feature coefficient
130675 serious -4.530316
22637 citation -3.157897
160045 violation -3.091275
111673 priority -2.784812
35563 critical -2.564958
130853 serious violation -2.351767
113761 provide -2.318456
74062 install -2.265158
28702 connected -2.045304
87459 maintained -2.011993

Explain why selecting a particular text pre-processing technique¶

  1. Elimination of Stop Words: Commonly occurring words such as "and", "the", and "is" are often excluded from the analysis because they typically don't provide valuable insights for predictive models.

  2. Exclusion of Numbers and Single Characters: In instances where not every unstructured comment is linked to a specific code (for example, "4-601.11(C):"), I ensure uniformity by discarding numbers and individual letters, thereby omitting code references from consideration.

  3. Token Lemmatization: To achieve uniformity among tokens, I apply lemmatization, which adjusts tokens to their base forms, enhancing their comparability.

  4. N-grams selection: after hyperparameter pipeline I find that ngram range of (1,2) works better. This may be because that using bigrams (pairs of consecutive words) along with unigrams allows the model to capture more context and the relationship between words, leading to a better understanding of the text's meaning.

Build a pipeline with hyperparameter tuning to find out the best n-grams¶

In [43]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("clf", LogisticRegression()),
    ]
)
pipeline
Out[43]:
Pipeline(steps=[('vect', CountVectorizer()), ('clf', LogisticRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('vect', CountVectorizer()), ('clf', LogisticRegression())])
CountVectorizer()
LogisticRegression()
In [44]:
parameter_grid = {
    'vect__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
    'vect__min_df': (1, 3, 5, 10),
    'vect__ngram_range': ((1, 1), (1, 2)),  # Unigrams or bigrams
    'clf__C': np.logspace(-4, 4, 9),  # Inverse of regularization strength
    'clf__penalty': ['l1', 'l2'],  # Type of regularization
    # 'clf__solver': ['liblinear', 'saga'],  # Solver, might be required if using 'l1' penalty
}
In [45]:
from pprint import pprint

from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=parameter_grid,
    n_iter=40,
    random_state=0,
    n_jobs=2,
    verbose=1,
)


print("Performing grid search...")
print("Hyperparameters to be evaluated:")
pprint(parameter_grid)
Performing grid search...
Hyperparameters to be evaluated:
{'clf__C': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03,
       1.e+04]),
 'clf__penalty': ['l1', 'l2'],
 'vect__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
 'vect__min_df': (1, 3, 5, 10),
 'vect__ngram_range': ((1, 1), (1, 2))}
In [46]:
from time import time

t0 = time()
random_search.fit(X_train, y_train)
print(f"Done in {time() - t0:.3f}s")
Fitting 5 folds for each of 40 candidates, totalling 200 fits
D:\python\Lib\site-packages\sklearn\model_selection\_validation.py:425: FitFailedWarning: 
115 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
115 fits failed with the following error:
Traceback (most recent call last):
  File "D:\python\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\python\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "D:\python\Lib\site-packages\sklearn\pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "D:\python\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "D:\python\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "D:\python\Lib\site-packages\sklearn\linear_model\_logistic.py", line 56, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  warnings.warn(some_fits_failed_message, FitFailedWarning)
D:\python\Lib\site-packages\sklearn\model_selection\_search.py:979: UserWarning: One or more of the test scores are non-finite: [       nan        nan        nan 0.92346667        nan        nan
 0.9522     0.83553333 0.96873333        nan        nan        nan
        nan 0.958      0.94546667        nan 0.95033333        nan
 0.95986667 0.95293333        nan        nan        nan 0.952
        nan 0.95126667        nan 0.8808            nan        nan
        nan 0.96033333        nan 0.931      0.9504            nan
        nan        nan 0.96726667 0.95      ]
  warnings.warn(
Done in 177.851s
D:\python\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [47]:
print("Best parameters combination found:")
best_parameters = random_search.best_estimator_.get_params()
for param_name in sorted(parameter_grid.keys()):
    print(f"{param_name}: {best_parameters[param_name]}")
Best parameters combination found:
clf__C: 1.0
clf__penalty: l2
vect__max_df: 0.4
vect__min_df: 1
vect__ngram_range: (1, 2)
In [48]:
test_accuracy = random_search.score(X_test, y_test)

print(
    "Accuracy of the best parameters using the inner CV of "
    f"the random search: {random_search.best_score_:.3f}"
)
print(f"Accuracy on test set: {test_accuracy:.3f}")
Accuracy of the best parameters using the inner CV of the random search: 0.969
Accuracy on test set: 0.965
In [49]:
def shorten_param(param_name):
    """Remove components' prefixes in param_name."""
    if "__" in param_name:
        return param_name.rsplit("__", 1)[1]
    return param_name


cv_results = pd.DataFrame(random_search.cv_results_)
cv_results = cv_results.rename(shorten_param, axis=1)
In [50]:
import plotly.express as px

param_names = [shorten_param(name) for name in parameter_grid.keys()]
labels = {
    "mean_score_time": "CV Score time (s)",
    "mean_test_score": "CV score (accuracy)",
}
fig = px.scatter(
    cv_results,
    x="mean_score_time",
    y="mean_test_score",
    error_x="std_score_time",
    error_y="std_test_score",
    hover_data=param_names,
    labels=labels,
)
fig.update_layout(
    title={
        "text": "trade-off between scoring time and mean test score",
        "y": 0.95,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    }
)
fig
00.050.10.150.20.250.30.350.40.450.840.860.880.90.920.940.96
trade-off between scoring time and mean test scoreCV Score time (s)CV score (accuracy)
plotly-logomark
In [51]:
import math

column_results = param_names + ["mean_test_score", "mean_score_time"]

transform_funcs = dict.fromkeys(column_results, lambda x: x)

transform_funcs["ngram_range"] = lambda x: x[1]

fig = px.parallel_coordinates(
    cv_results[column_results].apply(transform_funcs),
    color="mean_test_score",
    color_continuous_scale=px.colors.sequential.Viridis_r,
    labels=labels,
)
fig.update_layout(
    title={
        "text": "Parallel coordinates plot of text classifier pipeline",
        "y": 0.99,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    }
)
fig
0.840.860.880.90.920.940.96CV score (accuracy)Parallel coordinates plot of text classifier pipeline0.20.30.40.50.60.70.80.91max_df10.212345678910min_df10111.11.21.31.41.51.61.71.81.92ngram_range211k2k3k4k5k6k7k8k9k10kC10k0k0.840.860.880.90.920.940.96CV score (accuracy)0.968730.8355300.050.10.150.20.250.30.35CV Score time (s)0.395010
plotly-logomark

Visualize results of at least two text classifiers and select the most robust one¶

Logistic Regression Model is more robust.

In [105]:
import matplotlib.pyplot as plt

# Classifier names
classifiers = ['Naive Bayes', 'SVM', 'Logistic Regression']

# Corresponding accuracy scores
accuracies = [bayes_accuracy, svm_accuracy, logic_accuracy]

# Creating the enhanced bar plot
plt.figure(figsize=(10, 7))
bars = plt.bar(classifiers, accuracies, color=['skyblue', 'lightgreen', 'salmon'])

# Adding data labels above each bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.005, round(yval, 3), ha='center', va='bottom')

plt.xlabel('Classifier')
plt.ylabel('Accuracy Score')
plt.title('Comparison of Text Classifier Accuracies')
plt.ylim(0, 1.05)  # Extend y-axis limit slightly above 1 for better visibility of data labels
plt.yticks([i/10 for i in range(11)])  # Setting y-axis ticks to improve readability
plt.grid(axis='y', linestyle='--', alpha=0.7)  # Adding horizontal gridlines for easier comparison
plt.show()